KDD Cup 1999 network-intrusion dataset — task description: http://kdd.ics.uci.edu/databases/kddcup99/task.html
A paper that analyzes this dataset: https://web.cs.dal.ca/~zincir/bildiri/pst05-gnm.pdf
In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import model_selection, linear_model, cluster, \
preprocessing, metrics, pipeline, tree, ensemble, decomposition
pd.options.display.max_columns = 1000
%matplotlib inline
In [6]:
# Number of k-means clusters. 30 deliberately over-segments the data so
# small attack families can land in their own clusters (tune as needed).
num_cluster = 30
In [7]:
# Column names come from the KDD Cup 99 schema listing ("name: type.").
# Take the name before the colon on each non-empty line.
_schema = """
duration: continuous.
protocol_type: symbolic.
service: symbolic.
flag: symbolic.
src_bytes: continuous.
dst_bytes: continuous.
land: symbolic.
wrong_fragment: continuous.
urgent: continuous.
hot: continuous.
num_failed_logins: continuous.
logged_in: symbolic.
num_compromised: continuous.
root_shell: continuous.
su_attempted: continuous.
num_root: continuous.
num_file_creations: continuous.
num_shells: continuous.
num_access_files: continuous.
num_outbound_cmds: continuous.
is_host_login: symbolic.
is_guest_login: symbolic.
count: continuous.
srv_count: continuous.
serror_rate: continuous.
srv_serror_rate: continuous.
rerror_rate: continuous.
srv_rerror_rate: continuous.
same_srv_rate: continuous.
diff_srv_rate: continuous.
srv_diff_host_rate: continuous.
dst_host_count: continuous.
dst_host_srv_count: continuous.
dst_host_same_srv_rate: continuous.
dst_host_diff_srv_rate: continuous.
dst_host_same_src_port_rate: continuous.
dst_host_srv_diff_host_rate: continuous.
dst_host_serror_rate: continuous.
dst_host_srv_serror_rate: continuous.
dst_host_rerror_rate: continuous.
dst_host_srv_rerror_rate: continuous.
"""
columns = [entry.partition(":")[0] for entry in _schema.splitlines() if entry]
columns.append("Category")  # the final CSV field holds the attack label
print(columns)
In [8]:
# Load the full KDD Cup 99 data (no header row in the file).
# NOTE(review): hardcoded absolute path — assumes the dataset was
# downloaded to /data beforehand; consider a configurable DATA_DIR.
df = pd.read_csv("/data/kddcup.data", header=None, names=columns)
In [9]:
df.head()  # preview first rows to sanity-check column alignment
Out[9]:
In [10]:
df.Category.value_counts()  # raw label distribution (names carry a trailing '.')
Out[10]:
Attacks fall into one of four categories: User to Root (u2r), Remote to Local (r2l), Denial of Service (dos), and Probe (probe).
The mapping from raw attack names to these categories is below.
In [11]:
# Map each raw Category value to one of the four attack families
# (dos / probe / r2l / u2r) or "normal". Built from (name, family)
# pairs so the original insertion order is preserved exactly.
attack_types = dict([
    ("normal.", "normal"),
    ("buffer_overflow.", "u2r"),
    ("loadmodule.", "u2r"),
    ("perl.", "u2r"),
    ("neptune.", "dos"),
    ("smurf.", "dos"),
    ("guess_passwd.", "r2l"),
    ("pod.", "dos"),
    ("teardrop.", "dos"),
    ("portsweep.", "probe"),
    ("ipsweep.", "probe"),
    ("land.", "dos"),
    ("ftp_write.", "r2l"),
    ("back.", "dos"),
    ("imap.", "r2l"),
    ("satan.", "probe"),
    ("phf.", "r2l"),
    ("nmap.", "probe"),
    ("multihop.", "r2l"),
    ("warezmaster.", "r2l"),
    ("warezclient.", "r2l"),
    ("spy.", "r2l"),
    ("rootkit.", "u2r"),
])
In [12]:
# Binary label: anything other than the literal "normal." is an attack.
df["label"] = np.where(df.Category.eq("normal."), "normal", "attack")
# Coarse family label via the mapping defined earlier; like the original
# lambda lookup, this raises KeyError on an unseen category.
df["attack_type"] = df.Category.apply(attack_types.__getitem__)
In [13]:
df.label.value_counts()/df.shape[0]  # class balance (attack vs normal) as fractions
Out[13]:
In [14]:
df.attack_type.value_counts(dropna=False)  # rows per attack family (dropna=False would surface unmapped categories)
Out[14]:
In [15]:
# Keep only numeric columns for clustering; drops the object-dtype
# columns (presumably protocol_type/service/flag plus the label columns
# — verify with df.dtypes).
df_num = df.select_dtypes(include=[np.float64, np.int64])
df_num.head()
Out[15]:
In [16]:
# Standardize to zero mean / unit variance before PCA — PCA is scale-sensitive.
X = preprocessing.StandardScaler().fit_transform(df_num)
In [235]:
%%time
def display_2d(X, labels=None, n_samples=10000):
    """Project X onto its first two principal components and scatter-plot
    a random sample, colored red for attacks and green otherwise.

    Parameters
    ----------
    X : array-like of shape (n_rows, n_features)
        Feature matrix to project.
    labels : array-like of str, optional
        Per-row labels; rows equal to "attack" are drawn red.
        Defaults to the notebook-global ``df.label``. (Fix: the original
        body read an undefined global ``labels``, which raised NameError
        under Restart & Run All.)
    n_samples : int
        Number of points to sample for the scatter plot.

    Returns
    -------
    ndarray of shape (n_rows, 2)
        The full set of 2-D PCA coordinates.
    """
    if labels is None:
        labels = df.label  # fall back to the notebook-global frame
    pca = decomposition.PCA(n_components=2)
    pca_values = pca.fit_transform(X)
    X_pca = pd.DataFrame(pca_values.copy())
    X_pca["color"] = np.where(labels == "attack", "red", "green")
    X_sample = X_pca.sample(n_samples)
    X_sample.plot.scatter(0, 1, color=X_sample.color)
    return pca_values
X_pca = display_2d(X, labels=df.label)
In [237]:
%%time
# Baseline: how separable are attack vs normal using only the 2-D PCA
# projection? A shallow tree gives a quick accuracy estimate.
# NOTE(review): `est` is reused by a later cell, so the name must stay.
y = preprocessing.LabelEncoder().fit_transform(df.label)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X_pca, y, test_size = 0.3, random_state = 1)
est = tree.DecisionTreeClassifier(max_depth=5)
est.fit(X_train, y_train)
print("Accuracy:", est.score(X_test, y_test))
In [242]:
est.feature_importances_  # relative weight of the two PCA components in the tree
Out[242]:
In [39]:
# Fit PCA with all components to inspect the explained-variance spectrum.
# The bare fit() on the last line displays the fitted estimator repr.
pca = decomposition.PCA()
pca.fit(X)
Out[39]:
In [43]:
# Scree plot: per-component explained variance (bars) with the running
# cumulative total (line) overlaid on the same axes.
fig, ax = plt.subplots(figsize=(10, 6))
explained = pd.Series(pca.explained_variance_ratio_)
explained.plot.bar(ax=ax)
explained.cumsum().plot.line(ax=ax)
Out[43]:
In [171]:
pd.DataFrame({"cumsum": np.cumsum(pca.explained_variance_ratio_)}).query("cumsum>=0.99").head()  # smallest component count reaching 99% variance
Out[171]:
In [18]:
%%time
# Reduce to 25 components (covers >=99% of variance per the cumsum check).
# NOTE(review): this reassigns X_pca, silently shadowing the earlier
# 2-component projection used by the decision-tree baseline.
pca = decomposition.PCA(n_components=25)
X_pca = pca.fit_transform(X)
In [19]:
%%time
# Cluster the 25-D PCA projection. MiniBatchKMeans scales to this large
# dataset where full KMeans would be slow.
# Fix: pin random_state so the clustering — and every downstream
# per-cluster analysis, distance threshold, and outlier count — is
# reproducible across re-runs; the original was unseeded.
kmeans = cluster.MiniBatchKMeans(n_clusters=num_cluster, random_state=0)
y_cluster = kmeans.fit_predict(X_pca)
In [20]:
pd.Series(y_cluster).value_counts()  # points assigned to each cluster
Out[20]:
In [224]:
# Show the raw Category breakdown inside each cluster to see which
# attacks (if any) each cluster captured.
for cluster_id in range(num_cluster):
    print("Cluster: ", cluster_id, "")
    print(df.Category[y_cluster == cluster_id].value_counts())
    print("\n")
In [178]:
# Euclidean distance of every point to its assigned cluster centroid.
# NOTE: later debug cells reuse the leftover loop variables `i` and
# `centroid`, so both names are kept.
distances = np.zeros(df.shape[0])
for i in range(num_cluster):
    centroid = kmeans.cluster_centers_[i]
    members = y_cluster == i
    distances[members] = np.linalg.norm(X_pca[members] - centroid, axis=1)
# The 100 largest distances — candidate anomalies.
np.sort(distances)[::-1][:100]
Out[178]:
In [180]:
# Consistency check: the sum of squared point-to-centroid distances
# should roughly match the k-means inertia (mini-batch fitting makes
# the match approximate).
np.sum(distances ** 2), kmeans.inertia_
Out[180]:
In [ ]:
Average distance of a point to its closest centroid within each cluster
In [201]:
# Mean point-to-centroid distance per cluster (a tightness measure),
# plotted largest first.
cluster_avg_distances = [distances[y_cluster == c].mean() for c in range(num_cluster)]
pd.Series(cluster_avg_distances).sort_values(ascending=False).plot.bar()
Out[201]:
In [203]:
# Maximum point-to-centroid distance per cluster — highlights clusters
# containing far-flung points.
cluster_max_distances = [distances[y_cluster == c].max() for c in range(num_cluster)]
pd.Series(cluster_max_distances).sort_values(ascending=False).plot.bar()
Out[203]:
In [143]:
# Labels of the points farthest from their centroid.
# NOTE(review): 113 is a magic threshold read off the sorted-distance
# output above — it must be re-derived whenever the clustering is rerun.
pd.Series(df.label[distances>113]).value_counts()
Out[143]:
In [145]:
plt.boxplot(distances);  # distribution of point-to-centroid distances
In [162]:
def outliers(distances):
    """Flag values outside the Tukey box-plot whiskers of ``distances``.

    Parameters
    ----------
    distances : ndarray of float
        1-D array of point-to-centroid distances.

    Returns
    -------
    ndarray of bool
        True where a value lies above q3 + 1.5*IQR or below q1 - 1.5*IQR.
    """
    # Fix: np.percentile expects q in [0, 100]; the original passed
    # [0.25, 0.75] and so computed the 0.25th/0.75th percentiles
    # (essentially the minimum), giving IQR ~ 0 and flagging nearly
    # every point as an outlier.
    q1, q3 = np.percentile(distances, [25, 75])
    iqr = q3 - q1
    upper_whisker = q3 + 1.5 * iqr
    lower_whisker = q1 - 1.5 * iqr
    return (distances > upper_whisker) | (distances < lower_whisker)
In [166]:
pd.Series(outliers(distances)).value_counts()  # how many points the IQR rule flags
Out[166]:
In [122]:
# Debug cell: relies on the leftover loop variables `i` and `centroid`
# from the distance-computation cell (their final-iteration values) —
# fragile under Restart & Run All; consider deleting.
np.sqrt(np.sum((X_pca[y_cluster==i] - centroid)**2, axis = 1)).shape
Out[122]:
In [175]:
# Debug cell: also depends on the leftover loop variable `i`; consider deleting.
X_pca[y_cluster==i].shape
Out[175]:
In [ ]: